# -*- coding:utf-8 -*-
import re

import urllib
import urllib.request


import gzip
# 处理页面标签类


class Tool:
    """Convert an HTML fragment of a post into plain text.

    Every class attribute is a pre-compiled pattern. ``replace`` applies
    them in a fixed order: images and links are dropped, structural tags
    are turned into newlines/tabs, and whatever markup is left is removed.

    NOTE(review): this module defines ``Tool`` a second time further down;
    that later definition rebinds the name, so this one is shadowed at
    runtime.
    """

    # <img ...> tags and each run of seven consecutive spaces are dropped.
    # No trailing '|': an empty alternative would match at every position.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Opening and closing hyperlink tags are dropped (the link text stays).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that end a line are replaced with "\n".
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # A table cell start is replaced with "\t".
    replaceTD = re.compile(r'<td>')
    # A paragraph start is replaced with "\n" plus a two-space indent.
    replacePara = re.compile(r'<p.*?>')
    # A single or doubled <br> is replaced with one "\n".
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any tag still present after the steps above is dropped.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with its HTML markup converted to plain text.

        Args:
            x: HTML fragment as ``str``.

        Returns:
            The converted text with leading and trailing whitespace removed.
        """
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replacePara.sub("\n  ", x)
        x = self.replaceBR.sub("\n", x)
        # Must run last: it would otherwise eat the tags handled above.
        x = self.removeExtraTag.sub("", x)
        return x.strip()

# 处理页面标签类


class Tool:
    """Turn the HTML body of a post into readable plain text.

    The class attributes are pre-compiled patterns; ``replace`` runs them
    in order so that structural tags become whitespace before the remaining
    markup is stripped.

    NOTE(review): ``Tool`` is also defined earlier in this module; this
    later definition is the one the name refers to at runtime.
    """

    # Drop <img ...> tags and each run of seven consecutive spaces.
    # The pattern must not end with '|': an empty alternative matches
    # at every position of the input.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Drop hyperlink open/close tags, keeping the text between them.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags replaced by a line break.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cell start, replaced by a tab.
    replaceTD = re.compile(r'<td>')
    # Paragraph start, replaced by a line break plus a four-space indent.
    replacePara = re.compile(r'<p.*?>')
    # One or two consecutive <br> tags, replaced by a single line break.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Every tag that survived the previous steps.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with HTML tags removed or replaced by whitespace.

        Args:
            x: HTML fragment as ``str``.

        Returns:
            The plain-text result, stripped of surrounding whitespace.
        """
        # Order matters: the catch-all tag removal has to come last.
        rules = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n    "),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        for pattern, replacement in rules:
            x = pattern.sub(replacement, x)
        return x.strip()


class BDTB:
    """Fetch pages of a Baidu Tieba thread and print their posts as text."""

    # Captures the thread title from the page's <h3> heading.
    _TITLE_PATTERN = re.compile(
        '<h3 class="core_title_txt pull-left text-overflow  ".*?>(.*?)</h3>', re.S)
    # Captures the page count shown in the red <span> after "共".
    _PAGE_NUM_PATTERN = re.compile('共<span class="red">(.*?)</span>', re.S)
    # Captures the body of each post container.
    _CONTENT_PATTERN = re.compile('<div id="post_content.*?".*?>(.*?)</div>', re.S)

    def __init__(self, baseUrl, seeLZ):
        """Remember the thread address and the "original poster only" flag.

        Args:
            baseUrl: Base URL of the thread.
            seeLZ: Value sent as the ``see_lz`` query parameter (whether to
                show only the thread starter's posts).
        """
        self.baseUrl = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()

    def getPage(self, pageNum):
        """Download one page of the thread.

        Args:
            pageNum: Page number sent as the ``pn`` query parameter.

        Returns:
            The page decoded as UTF-8 text, or ``None`` if anything went
            wrong while building the request, downloading or decoding.
        """
        try:
            url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
            print(url)
            request = urllib.request.Request(url)
            # The context manager closes the connection even on errors.
            with urllib.request.urlopen(request) as response:
                data = response.read().decode('utf-8')  # bytes -> str
            print(type(data))
            return data
        except Exception as e:
            # str(e) is required: concatenating the exception object itself
            # raises TypeError and would mask the real failure.
            print('连接百度贴吧失败，错误原因:' + str(e))
            return None

    def saveFile(self, data, savePath='E:\\python\\python\\bdtb.out'):
        """Write ``data`` to disk encoded as UTF-8.

        Args:
            data: Text to store.
            savePath: Destination file; defaults to the historical
                hard-coded location.
        """
        with open(savePath, 'wb') as f_obj:
            f_obj.write(data.encode('utf-8'))  # str -> bytes

    def _searchFirstPage(self, pattern):
        """Return the stripped first capture of ``pattern`` on page 1, or None."""
        page = self.getPage(1)
        if page is None:
            # Download failed; getPage has already reported the reason.
            return None
        result = pattern.search(page)
        return result.group(1).strip() if result else None

    def getTitle(self):
        """Return the thread title, or ``None`` if it cannot be found."""
        return self._searchFirstPage(self._TITLE_PATTERN)

    def getPageNum(self):
        """Return the page count as a string, or ``None`` if not found."""
        return self._searchFirstPage(self._PAGE_NUM_PATTERN)

    def getContent(self, page):
        """Print every post found in ``page``, numbered from floor 1.

        Args:
            page: HTML of a thread page as returned by ``getPage``; ``None``
                (a failed download) prints nothing.
        """
        if page is None:
            return
        items = self._CONTENT_PATTERN.findall(page)
        for floor, item in enumerate(items, start=1):
            print(str(floor) + "楼-------------\n")
            print(self.tool.replace(item))


# Script entry: fetch the first page of one thread (original poster only)
# and print its posts.
# NOTE(review): this runs at import time; consider moving it behind an
# ``if __name__ == "__main__":`` guard once no importer relies on it.
baseUrl = 'http://tieba.baidu.com/p/3138733512'
bdtb = BDTB(baseUrl, 1)
page = bdtb.getPage(1)
# getPage is written to return None on failure; skip parsing in that case.
if page is not None:
    bdtb.getContent(page)
